#!/usr/bin/env python
# encoding=utf8

from bs4 import BeautifulSoup, UnicodeDammit
import sqlite3
import requests
import re
import locale
from time import sleep
import time
from selenium import webdriver
from random import randint

# Important for UnicodeDammit
# Python 2 only: make UTF-8 the interpreter-wide default encoding used for
# implicit str <-> unicode conversions.  `sys` is reloaded first because
# `setdefaultencoding` is normally removed from the module at start-up.
# NOTE(review): `reload` is not a builtin on Python 3 and
# `sys.setdefaultencoding` does not exist there - this file targets Python 2.
import sys
reload(sys)
sys.setdefaultencoding("utf-8")


def main():
    """Run the crawl against the project's SQLite file.

    Only the post/comment parsing stage is active.  The index-building and
    clean-up stages are kept as commented-out calls so they can be enabled
    by hand when needed.
    """
    db_path = "m5s_blog_feb2016.sqlite"

    # Optional stages, enabled manually:
    # buildIndex(db_path) # <- This only on first round
    # buildUpdateIndex(db_path)
    # removePostsCommentsAfter('2015-02-01', db_path)
    parseBlog(db_path)


def cleanHtml(html_doc):
    """Return *html_doc* with line breaks blanked and ``&#`` escaped.

    ``<br>`` and ``<br />`` are each replaced by a single space, then every
    ``&#`` is rewritten to ``&amp;#``.  The substitutions are applied in that
    order; other spellings of the break tag are left untouched.
    """
    substitutions = (
        ("<br>", " "),
        ("<br />", " "),
        ("&#", "&amp;#"),
    )

    cleaned = html_doc
    for needle, replacement in substitutions:
        cleaned = cleaned.replace(needle, replacement)

    return cleaned


def removePostsCommentsAfter(cut_date, database):
    """Delete every post and comment dated on or after *cut_date*.

    Rows are removed from ``post`` (column ``date``) and from ``comment``
    (column ``created``); the number of rows deleted from each table is
    printed.  Both deletions are committed together.

    Args:
        cut_date: date string handed to SQLite's ``date()`` function,
            e.g. ``'2015-02-01'``.
        database: path of the SQLite file holding the two tables.
    """
    conn = sqlite3.connect(database)
    try:
        conn.text_factory = str
        cursor = conn.cursor()

        cursor.execute("DELETE FROM post WHERE date >= date(?);", (cut_date,))
        print("Rows deleted from 'post': " + str(cursor.rowcount))

        cursor.execute("DELETE FROM comment WHERE created >= date(?);", (cut_date,))
        print("Rows deleted from 'comment': " + str(cursor.rowcount))

        conn.commit()
    finally:
        # Always release the file handle; if an error occurred before the
        # commit, closing discards the uncommitted deletes.
        conn.close()
    


def buildUpdateIndex(database):
    """Rebuild the ``index`` rows for the months 2015/02 through 2016/02.

    For each month the archive page is downloaded, the existing index rows
    for that month are deleted (and that deletion committed), and the links
    found on the archive page are inserted again.  A random pause of 20-40
    seconds follows every request.

    Args:
        database: path of the SQLite file holding the ``index`` table.
    """
    base_url = "http://www.beppegrillo.it/"
    year_month = ["2015/02/","2015/03/","2015/04/","2015/05/","2015/06/","2015/07/","2015/08/","2015/09/","2015/10/","2015/11/","2015/12/","2016/01/","2016/02/"]

    conn = sqlite3.connect(database)
    try:
        conn.text_factory = str
        cursor = conn.cursor()

        for pair in year_month:
            url = base_url + pair
            # Log before the request so a hanging download is attributable.
            print("Requesting " + url + "...")
            html_doc = requests.get(url)
            soup = BeautifulSoup(html_doc.text)
            resetIndex(pair, cursor)
            conn.commit()
            parseMonthArchive(soup, url, cursor)
            sleep(randint(20,40))

        conn.commit()
    finally:
        # Release the database file even when a request or parse fails.
        conn.close()

    return

def resetIndex(pair, cursor):
    """Delete the ``index`` rows that belong to one month archive.

    Every row whose ``url`` starts with ``http://www.beppegrillo.it/``
    followed by *pair* is removed, and the number of deleted rows is printed.
    Nothing is committed here; that is left to the caller.

    Args:
        pair: path fragment appended to the site root, e.g. ``"2015/02/"``.
        cursor: SQLite cursor on the database holding ``index``.
    """
    cursor.execute("""
    DELETE FROM [index] WHERE url LIKE ('http://www.beppegrillo.it/' || ? || '%');
    """, (pair,))
    print("Urls deleted from 'index': " + str(cursor.rowcount))

    
def buildIndex(database, first_year=2005, n_year=10):
    """Fill the ``index`` table from every monthly archive page.

    The archive page of each month of each year in the requested range is
    downloaded and the links found on it are inserted into ``index``.  The
    inserts are committed after every month, so an interrupted crawl keeps
    what it has collected so far.  A random pause of 20-40 seconds follows
    every request.

    Args:
        database: path of the SQLite file holding the ``index`` table.
        first_year: first archive year to crawl.
        n_year: number of consecutive years to crawl, starting at
            *first_year*.
    """
    base_url = "http://www.beppegrillo.it/"
    months = ["%02d" % month for month in range(1, 13)]

    conn = sqlite3.connect(database)
    try:
        conn.text_factory = str
        cursor = conn.cursor()

        for year in range(first_year, first_year + n_year):
            for month in months:
                url = base_url + str(year) + "/" + month + "/"
                # Log before the request so a hanging download is attributable.
                print("Requesting " + url + "...")
                html_doc = requests.get(url)
                soup = BeautifulSoup(html_doc.text)
                parseMonthArchive(soup, url, cursor)
                conn.commit()
                sleep(randint(20,40))

        conn.commit()
    finally:
        # Release the database file even when a request or parse fails.
        conn.close()

    return

def findPageUrls(cursor):
    """Return the URLs in ``index`` whose status is not ``'OK'``.

    Rows whose status is NULL are included, because the query uses
    ``IS NOT`` rather than ``!=``.

    Args:
        cursor: SQLite cursor on the database holding ``index``.

    Returns:
        A list with the ``url`` value of every matching row.
    """
    cursor.execute("SELECT url FROM [index] WHERE status IS NOT 'OK'")
    return [row[0] for row in cursor.fetchall()]

def findCommentPages(soup):
    """Collect the links of the comment pager of a post page.

    The ``href`` of every ``<a>`` inside the element with id ``capages`` is
    returned.  This is best effort: any failure (for instance when the page
    has no ``capages`` element) is reported on stderr and whatever was
    collected up to that point is returned.

    Args:
        soup: parsed post page.

    Returns:
        A list of ``href`` strings, possibly empty.
    """
    link_list = []

    try:
        pager = soup.find(id="capages")
        for link in pager.find_all("a", href=True):
            link_list.append(link['href'])
    except Exception as err:
        # Deliberately broad: a page without a pager must not stop the crawl.
        sys.stderr.write('ERROR: %s\n' % str(err))

    return link_list


def parseMonthArchive(soup, url, cursor):
    """Store every link of an archive page that lives under *url*.

    Each ``<a href=...>`` whose target starts with *url* is handed to
    ``enterIndex``.  The comparison is a literal prefix test; the URL is not
    interpreted as a regular expression, so characters such as ``.`` only
    match themselves.

    Args:
        soup: parsed month archive page.
        url: URL of the archive page, used as the required link prefix.
        cursor: SQLite cursor passed through to ``enterIndex``.
    """
    for anchor in soup.find_all("a", href=True):
        link = anchor['href']
        if link.startswith(url):
            enterIndex({'link': link}, cursor)

    return

def parseBlog(database):
    """Download and store every indexed page whose status is not ``'OK'``.

    For each URL returned by ``findPageUrls``:

    1. the page is rendered with PhantomJS; on failure the index row is
       marked ``'Error main page'`` and the URL is skipped;
    2. the post is parsed and stored; on failure the URL is skipped and its
       status is left unchanged;
    3. the comment pages linked from the post are parsed; the index row is
       then marked ``'OK'`` or ``'Error comment page'``.

    Every status change is committed immediately, and the connection is
    closed when the function exits.

    Args:
        database: path of the SQLite file holding the ``index``, ``post`` and
            ``comment`` tables.
    """
    conn = sqlite3.connect(database)
    try:
        conn.text_factory = str
        cursor = conn.cursor()

        urls = findPageUrls(cursor)
        remaining_pages = len(urls)

        for url in urls:
            print("Remaining pages: " + str(remaining_pages))
            print("\n")
            print("Parsing page " + url)

            # One short-lived browser per page; it is always shut down,
            # whether the download succeeded or not.
            fetched = False
            driver = webdriver.PhantomJS()
            try:
                driver.get(url)
                html_doc = driver.page_source
                fetched = True
            except Exception as e:
                print(str(e))
            finally:
                driver.quit()

            if not fetched:
                cursor.execute("UPDATE [index] SET status = 'Error main page' WHERE url = ?", (url,))
                # Commit right away so the status survives even if this is
                # the last URL or the run is interrupted.
                conn.commit()
                print("Error with main page")
                print("\n")
                sleep(randint(50,100))
                continue

            soup = BeautifulSoup(UnicodeDammit.detwingle(html_doc))
            print("Parsing post...")
            try:
                parsePost(soup, url, cursor)
            except Exception as e:
                # The status is left untouched, so the page is retried on the
                # next run.
                print(str(e))
                sleep(randint(50,100))
                continue

            comment_pages = findCommentPages(soup)
            conn.commit()

            print("Parsing comments...")
            sleep(randint(20,40))

            statusComments = requestingCommentPages(comment_pages, url, cursor)

            if statusComments == "OK":
                print("OK, page completed.")
                print("\n")
                remaining_pages -= 1
                cursor.execute("UPDATE [index] SET status = 'OK' WHERE url = ?", (url,))
            else:
                print("Error with comment page")
                print("\n")
                cursor.execute("UPDATE [index] SET status = 'Error comment page' WHERE url = ?", (url,))
            conn.commit()
    finally:
        conn.close()

    return
            
        
def requestingCommentPages(comment_pages, url, cursor):
    """Render each comment page and store the comments found on it.

    A single PhantomJS instance is used for all pages and is always shut
    down before returning, including when a page fails.  Processing stops at
    the first page that raises.

    Args:
        comment_pages: iterable of comment-page URLs.
        url: URL of the post the comments belong to.
        cursor: SQLite cursor passed through to ``parseComment``.

    Returns:
        ``"OK"`` when every page was processed, ``"Error"`` as soon as one
        page fails.
    """
    driver = webdriver.PhantomJS()

    try:
        for comment_page in comment_pages:
            print("Parsing comment page " + comment_page)

            # Two options for requesting page (uncomment just one)
            try:
                driver.get(comment_page)
                element = driver.find_element_by_xpath("//div[@id='commentsToSort']")
                soup = BeautifulSoup(UnicodeDammit.detwingle(element.get_attribute('innerHTML')))
                parseComment(soup, url, cursor)
                sleep(randint(20,40))
            except Exception as e:
                print(str(e))
                sleep(randint(50,100))
                return "Error"

            # 2
            ## try:
            ##     html_doc  = requests.get(comment_page).text
            ##     html_doc  = cleanHtml(html_doc)
            ##     soup = BeautifulSoup(UnicodeDammit.detwingle(html_doc))
            ##     parseComment(soup, url, cursor)
            ##     sleep(randint(20,40))
            ## except Exception as e:
            ##     print(str(e))
            ##     sleep(randint(50,100))
            ##     return "Error"

        return "OK"
    finally:
        # Runs on both return paths, so no browser process is left behind.
        driver.quit()


def parsePost(soup, url, cursor):
    """Extract title, body and date from a post page and store them.

    The title comes from the element with class ``titolopost``, the body is
    the concatenated markup of every ``<p>`` inside ``span.BodyPost``, and the
    date is the part of ``p.posted`` before the first ``|``.  The record is
    handed to ``enterPost``.

    Args:
        soup: parsed post page.
        url: URL of the post, stored with the record.
        cursor: SQLite cursor passed through to ``enterPost``.

    Raises:
        AttributeError: when one of the expected elements is missing.
        ValueError: when the date text does not match ``%d %b %Y,%H:%M``.
    """
    # Italian month abbreviations -> the English ones parsed by %b.
    # NOTE(review): %b is locale dependent; this presumably relies on the
    # default C locale - confirm.
    month_names = {
        'Gen':'Jan',
        'Feb':'Feb',
        'Mar':'Mar',
        'Apr':'Apr',
        'Mag':'May',
        'Giu':'Jun',
        'Lug':'Jul',
        'Ago':'Aug',
        'Set':'Sep',
        'Ott':'Oct',
        'Nov':'Nov',
        'Dic':'Dec'
    }
    month_regex = re.compile(r'\b(' + '|'.join(month_names.keys()) + r')\b')

    title = soup.find(class_="titolopost").get_text(strip=True).encode('utf-8')

    # Keep the markup of each paragraph, not just its text.
    body = soup.find("span", "BodyPost")
    paragraphs = [paragraph.encode('utf-8') for paragraph in body.find_all('p')]

    posted = soup.find("p", class_="posted").get_text(strip=True).encode('utf-8')
    raw_date = posted.split("|")[0]
    english_date = month_regex.sub(lambda match: month_names[match.group()], raw_date)
    parsed_date = time.strptime(english_date, "%d %b %Y,%H:%M")

    post = {
        'url': url,
        'title': title,
        'text': ''.join(paragraphs),
        'date': time.strftime("%Y-%m-%d %H:%M:%S", parsed_date),
    }
    enterPost(post, cursor)

    return


def parseComment(soup, post_url, cursor):
    """Extract every comment in *soup* and store it through ``enterComment``.

    The markup is walked as the flat list of tags returned by
    ``soup.find_all()``.  A run of ``<p>`` tags that are adjacent in that list
    is treated as the text of one comment (joined with newlines).  Author,
    timestamp and id are read from a sibling that follows the first ``<p>``
    of the run:

    * if a following sibling has class ``comment-posted``, the comment is a
      top-level one and its id is taken from the ``votecomment...`` token;
    * otherwise the next ``<table>`` sibling must contain a
      ``posted posted-nested`` element; the comment is stored as a reply to
      the most recent top-level comment seen during this call (``None`` if
      there was none) and its id is taken from the ``abusecomment...`` token.

    Args:
        soup: parsed markup of the comment container.
        post_url: URL of the post the comments belong to.
        cursor: SQLite cursor passed through to ``enterComment``.

    Raises:
        AttributeError, TypeError, ValueError: when the markup around a
            comment does not have the expected shape; nothing is caught here.
    """
    # NOTE(review): the trailing '|' adds an empty alternative, so search()
    # never returns None; when the name/date part cannot match at position 0
    # both groups are None and strptime raises TypeError.  Kept unchanged -
    # confirm against the page markup whether a literal pipe was intended.
    name_date_pattern = re.compile(r'(.*?)\s+(\d{2}\.\d{2}\.(?:\d{2}|\d{4})\s+\d{2}:\d{2})|')
    reply_id_pattern = re.compile('abusecomment(.*?)\"')
    comment_id_pattern = re.compile('votecomment(.*?)\"')

    # The tree is not modified below, so the flat tag list is computed once
    # instead of on every index lookup.
    elements = soup.find_all()
    total = len(elements)

    # Id of the latest top-level comment; replies are attached to it.
    comment_id = None

    for i, element in enumerate(elements):

        # Check 1: only <p> tags start a comment
        if element.name != 'p':
            continue

        # Check 2: a <p> preceded by another <p> was already merged into the
        # previous comment.  The i > 0 guard stops index -1 from wrapping
        # around to the last tag.
        if i > 0 and elements[i - 1].name == 'p':
            continue

        p = element
        parts = [p.get_text(strip=True).encode('utf-8')]

        # Check 3: append the text of every directly following <p>
        j = i
        while j + 1 < total and elements[j + 1].name == 'p':
            parts.append(elements[j + 1].get_text(strip=True).encode('utf-8'))
            j += 1
        text = "\n".join(parts)

        header = p.find_next_sibling(class_='comment-posted')
        is_reply = header is None
        if is_reply:
            # Case 1: reply to comment
            table = p.find_next_sibling('table')
            header = table.find(class_="posted posted-nested")
            id_pattern = reply_id_pattern
        else:
            # Case 2: normal comment
            id_pattern = comment_id_pattern

        name_date = header.td.get_text()
        name, date = name_date_pattern.search(name_date).groups()
        try:
            date = time.strptime(date, "%d.%m.%y %H:%M")
        except ValueError:
            # The pattern above also admits four-digit years.
            date = time.strptime(date, "%d.%m.%Y %H:%M")

        vote_markup = header.find(class_="vota").encode('utf-8').strip()
        found_id = id_pattern.search(vote_markup).group(1)

        comment = {}
        if is_reply:
            comment['comment_id'] = found_id
            comment['replied_to'] = comment_id
        else:
            comment_id = found_id
            comment['comment_id'] = comment_id
            comment['replied_to'] = None

        # Details common to case 1 and case 2
        comment['text'] = text
        comment['username'] = name
        comment['created'] = time.strftime("%Y-%m-%d %H:%M:%S", date)
        comment['post_url'] = post_url

        enterComment(comment, cursor)

    return


def enterIndex(object, cursor):
    """Insert one URL into ``index``, ignoring it if the insert conflicts.

    Best effort: any error is reported on stderr instead of being raised.

    Args:
        object: mapping with the URL under the key ``'link'``.
        cursor: SQLite cursor on the database holding ``index``.

    Returns:
        ``None`` on success, ``1`` when the insert failed.
    """
    try:
        cursor.execute("INSERT OR IGNORE INTO [index] (url) VALUES (?)", (object['link'],))
    except Exception as err:
        sys.stderr.write('ERROR: %s\n' % str(err))
        return 1

    return


def enterPost(object, cursor):
    """Insert one post into ``post``, ignoring it if the insert conflicts.

    Best effort: any error is reported on stderr instead of being raised.

    Args:
        object: mapping with the keys ``'url'``, ``'title'``, ``'text'`` and
            ``'date'``.
        cursor: SQLite cursor on the database holding ``post``.

    Returns:
        ``None`` on success, ``1`` when the insert failed.
    """
    try:
        values = (object['url'], object['title'], object['text'], object['date'])
        cursor.execute("INSERT OR IGNORE INTO post (url, title, text, date) VALUES (?, ?, ?, ?)", values)
    except Exception as err:
        sys.stderr.write('ERROR: %s\n' % str(err))
        return 1

    return


def enterComment(object, cursor):
    """Insert one comment into ``comment``, ignoring it if the insert conflicts.

    Best effort: any error is reported on stderr instead of being raised.

    Args:
        object: mapping with the keys ``'comment_id'``, ``'text'``,
            ``'username'``, ``'created'``, ``'replied_to'`` and ``'post_url'``.
        cursor: SQLite cursor on the database holding ``comment``.

    Returns:
        ``None`` on success, ``1`` when the insert failed.
    """
    try:
        values = (
            object['comment_id'],
            object['text'],
            object['username'],
            object['created'],
            object['replied_to'],
            object['post_url'],
        )
        cursor.execute("INSERT OR IGNORE INTO comment (comment_id, text, username, created, replied_to, post_url) VALUES (?, ?, ?, ?, ?, ?)", values)
    except Exception as err:
        sys.stderr.write('ERROR: %s\n' % str(err))
        return 1

    return


### Execute
# Only start the crawl when the file is run as a script, not when it is
# imported (for example to reuse a single parsing function).
if __name__ == "__main__":
    main()
